Here, we will apply a k-nearest neighbor (KNN) algorithm to classify the scATAC cells to a given cell type category with the help of our training set, the Multiome experiment. Remember, that KNN works on a basic assumption that data points of similar categories are closer to each other.
library(Seurat)
library(Signac)
library(flexclust)
library(tidyverse)
library(plyr)
library(harmony)
library(class)
library(ggplot2)
library(reshape2)
set.seed(1234)
cell_type = "GCBC"
# Paths
path_to_obj <- str_c(
here::here("scATAC-seq/results/R_objects/level_4/"),
cell_type,
"/",
cell_type,
"_integrated_level_4.rds",
sep = ""
)
path_to_obj_RNA <- str_c(
here::here("scRNA-seq/3-clustering/4-level_4/"),
cell_type,
"/",
cell_type,
"_clustered_clean_level_4_with_final_clusters.rds",
sep = ""
)
path_to_level_4 <- here::here("scATAC-seq/results/R_objects/level_4/GCBC/")
path_to_save <- str_c(path_to_level_4, "GCBC_annotated_level_4.rds")
reduction <- "harmony"
dims <- 1:40
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D",
"#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
"#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
"#FBE426", "#16FF32", "black", "#3283FE",
"#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB",
"#822E1C", "coral2", "#1CFFCE", "#1CBE4F",
"#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
"#FEAF16", "#683B79", "#B10DA1", "#1C7F93",
"#F8A19F", "dark orange", "#FEAF16", "#FBE426",
"Brown")
seurat <- readRDS(path_to_obj_RNA)
seurat$level_5 <- revalue(seurat$cluster8_subcluster,
c("0"= "DZ/LZ",
"1_0"="DZ_Sphase_HistoneHigh",
"1_1"="DZ_G2M",
"2_0"="DZ-nonproliferative",
"2_1_0"="DZ-nonproliferative",
"2_1_1"="DZ-nonproliferative",
"3_0"="LZ",
"3_1"="LZ-DZ-re-entry",
"3_2"="LZ",
"4"="DZ_Sphase",
"5_0"="LZ-proliferative_BCL2A1neg",
"5_1"="LZ-proliferative_BCL2A1pos",
"6"="DZ_G2M",
"7_0"="MBC-like",
"7_1"="MBC-like",
"8_0"="MBC-like",
"8_1"="MBC-like",
"8_2"="MBC-like",
"9"="PC-precursors"))
tonsil_RNA_annotation <- seurat@meta.data %>%
rownames_to_column(var = "cell_barcode") %>%
dplyr::filter(assay == "multiome") %>%
dplyr::select("cell_barcode", "level_5")
head(tonsil_RNA_annotation)
## cell_barcode level_5
## 1 co7dzuup_xuczw9vc_AAACAGCCAAAGGTAC-1 LZ
## 2 co7dzuup_xuczw9vc_AAACAGCCAATAGCAA-1 DZ_G2M
## 3 co7dzuup_xuczw9vc_AAACAGCCATTAAGTC-1 DZ-nonproliferative
## 4 co7dzuup_xuczw9vc_AAACCGAAGCTTACTT-1 DZ_G2M
## 5 co7dzuup_xuczw9vc_AAACCGCGTATTCGCT-1 DZ_G2M
## 6 co7dzuup_xuczw9vc_AAACCGCGTTGTAAAC-1 DZ_Sphase
DimPlot(seurat,
group.by = "level_5",
cols = color_palette,
pt.size = 0.1)
seurat_ATAC <- readRDS(path_to_obj)
seurat_ATAC
## An object of class Seurat
## 270784 features across 21447 samples within 1 assay
## Active assay: peaks_macs (270784 features, 260336 variable features)
## 3 dimensional reductions calculated: umap, lsi, harmony
p1 <- DimPlot(seurat_ATAC,
pt.size = 0.1)
p1
Annotation level 1 for scATAC will be defined “a priori” as unannotated and the scRNA annotation will be transfered to the scATAC-multiome cells based on the same cell barcode.
tonsil_scATAC_df <- data.frame(cell_barcode = colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"])
tonsil_scATAC_df$level_5 <- "unannotated"
df_all <- rbind(tonsil_RNA_annotation,tonsil_scATAC_df)
rownames(df_all) <- df_all$cell_barcode
df_all <- df_all[colnames(seurat_ATAC), ]
seurat_ATAC$level_5 <- df_all$level_5
seurat_ATAC@meta.data$annotation_prob <- 1
melt(table(seurat_ATAC$level_5))
## Var1 value
## 1 DZ_G2M 1416
## 2 DZ_Sphase 847
## 3 DZ_Sphase_HistoneHigh 1297
## 4 DZ-nonproliferative 1375
## 5 DZ/LZ 1537
## 6 LZ 654
## 7 LZ-DZ-re-entry 303
## 8 LZ-proliferative_BCL2A1neg 383
## 9 LZ-proliferative_BCL2A1pos 442
## 10 MBC-like 131
## 11 PC-precursors 179
## 12 unannotated 12883
table(is.na(seurat_ATAC$level_5))
##
## FALSE
## 21447
DimPlot(seurat_ATAC,
group.by = "level_5",
split.by = "assay",
cols = color_palette,
pt.size = 0.5)
Data splicing basically involves splitting the data set into training and testing data set.
reference_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "multiome"]
query_cells <- colnames(seurat_ATAC)[seurat_ATAC$assay == "scATAC"]
reduction_ref <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[reference_cells, dims]
reduction_query <- seurat_ATAC@reductions[[reduction]]@cell.embeddings[query_cells, dims]
We’re going to calculate the number of observations in the training dataset that correspond to the Multiome data. The reason we’re doing this is that we want to initialize the value of ‘K’ in the KNN model. To do that, we split our training data in two part: a train.loan, that correspond to the random selection of the 70% of the training data and the test.loan, that is the remaining 30% of the data set. The first one is used to traint the system while the second is uses to evaluate the learned system.
dat.d <- sample(1:nrow(reduction_ref),
size=nrow(reduction_ref)*0.7,replace = FALSE)
train.loan <- reduction_ref[dat.d,] # 70% training data
test.loan <- reduction_ref[-dat.d,] # remaining 30% test data
train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$level_5
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$level_5
k.optm <- c()
k.values <- c()
for (i in c(2,4,8,10,12,14,16,32,64,128)){
print(i)
knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=i)
k.optm <- c(k.optm, 100 * sum(test.loan_labels == knn.mod)/NROW(test.loan_labels))
k.values <- c(k.values,i)
}
## [1] 2
## [1] 4
## [1] 8
## [1] 10
## [1] 12
## [1] 14
## [1] 16
## [1] 32
## [1] 64
## [1] 128
Now we can plot the accuracy of the model taking in account a range of different K and selec the best one.
k.optim = data.frame(k.values,k.optm)
p3 <- ggplot(data=k.optim, aes(x=k.values, y=k.optm, group=1)) +
geom_line() +
geom_point() +
geom_vline(xintercept=14, linetype="dashed", color = "red")
p3
train.loan <- reduction_ref
test.loan <- reduction_query
train.loan_labels <- seurat_ATAC@meta.data[row.names(train.loan),]$level_5
test.loan_labels <- seurat_ATAC@meta.data[row.names(test.loan),]$level_5
knn.mod <- knn(train=train.loan, test=test.loan, cl=train.loan_labels, k=14, prob=T)
annotation_data <- data.frame(query_cells, knn.mod, attr(knn.mod,"prob"))
colnames(annotation_data) <- c("query_cells",
"level_5",
"annotation_prob")
annotation_data$level_5 <- as.character(annotation_data$level_5)
seurat_ATAC@meta.data[annotation_data$query_cells,]$level_5 <- annotation_data$level_5
seurat_ATAC@meta.data[annotation_data$query_cells,]$annotation_prob <- annotation_data$annotation_prob
seurat_ATAC$level_5 <- factor(seurat_ATAC$level_5)
DimPlot(
seurat_ATAC,
cols = color_palette,
group.by = "level_5",
pt.size = 0.1)
DimPlot(
cols = color_palette,
seurat_ATAC,
group.by = "level_5",
pt.size = 0.1, split.by = "assay")
melt(table(seurat_ATAC$level_5))
## Var1 value
## 1 DZ_G2M 3305
## 2 DZ_Sphase 1987
## 3 DZ_Sphase_HistoneHigh 2904
## 4 DZ-nonproliferative 4094
## 5 DZ/LZ 5184
## 6 LZ 1320
## 7 LZ-DZ-re-entry 680
## 8 LZ-proliferative_BCL2A1neg 535
## 9 LZ-proliferative_BCL2A1pos 733
## 10 MBC-like 179
## 11 PC-precursors 526
saveRDS(seurat_ATAC, path_to_save)
Note that the probability of the prediction was lower in the transitioning cells and in not-defined clusters.
seurat_ATAC_scATAC = subset(seurat_ATAC, assay == "scATAC")
FeaturePlot(
seurat_ATAC_scATAC, reduction = "umap",
features = "annotation_prob",
pt.size = 0.1)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Tonsil_atlas/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 class_7.3-18 harmony_1.0 Rcpp_1.0.6 plyr_1.8.6 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.5 tidyverse_1.3.0 flexclust_1.4-0 modeltools_0.2-23 lattice_0.20-41 Signac_1.2.1.9003 SeuratObject_4.0.2 Seurat_4.0.3 BiocStyle_2.16.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.0 fastmatch_1.1-0 igraph_1.2.6 lazyeval_0.2.2 splines_4.0.3 BiocParallel_1.22.0 listenv_0.8.0 scattermore_0.7 SnowballC_0.7.0 GenomeInfoDb_1.24.0 digest_0.6.27 htmltools_0.5.1.1 fansi_0.5.0 magrittr_2.0.1 tensor_1.5 cluster_2.1.0 ROCR_1.0-11 globals_0.14.0 Biostrings_2.56.0 modelr_0.1.8 matrixStats_0.59.0 docopt_0.7.1 spatstat.sparse_2.0-0 colorspace_2.0-2 rvest_0.3.6 blob_1.2.1 ggrepel_0.9.1 haven_2.3.1 xfun_0.18 sparsesvd_0.2 crayon_1.4.1 RCurl_1.98-1.2 jsonlite_1.7.2 spatstat.data_2.1-0 survival_3.2-7 zoo_1.8-9 glue_1.4.2 polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.34.0 XVector_0.28.0 leiden_0.3.8 future.apply_1.7.0 BiocGenerics_0.36.1 abind_1.4-5 scales_1.1.1 DBI_1.1.0 miniUI_0.1.1.1 viridisLite_0.4.0 xtable_1.8-4
## [52] reticulate_1.20 spatstat.core_2.2-0 htmlwidgets_1.5.3 httr_1.4.2 RColorBrewer_1.1-2 ellipsis_0.3.2 ica_1.0-2 pkgconfig_2.0.3 farver_2.1.0 dbplyr_1.4.4 ggseqlogo_0.1 uwot_0.1.10.9000 deldir_0.2-10 here_1.0.1 utf8_1.2.1 labeling_0.4.2 tidyselect_1.1.1 rlang_0.4.11 later_1.2.0 cellranger_1.1.0 munsell_0.5.0 tools_4.0.3 cli_3.0.0 generics_0.1.0 broom_0.7.2 ggridges_0.5.3 evaluate_0.14 fastmap_1.1.0 yaml_2.2.1 goftest_1.2-2 fs_1.5.0 knitr_1.30 fitdistrplus_1.1-5 RANN_2.6.1 pbapply_1.4-3 future_1.21.0 nlme_3.1-150 mime_0.11 slam_0.1-48 RcppRoll_0.3.0 xml2_1.3.2 rstudioapi_0.12 compiler_4.0.3 plotly_4.9.4.1 png_0.1-7 spatstat.utils_2.2-0 reprex_0.3.0 tweenr_1.0.2 stringi_1.6.2 Matrix_1.3-4 vctrs_0.3.8
## [103] pillar_1.6.1 lifecycle_1.0.0 BiocManager_1.30.10 spatstat.geom_2.2-0 lmtest_0.9-38 RcppAnnoy_0.0.18 data.table_1.14.0 cowplot_1.1.1 bitops_1.0-7 irlba_2.3.3 httpuv_1.6.1 patchwork_1.1.1 GenomicRanges_1.40.0 R6_2.5.0 bookdown_0.21 promises_1.2.0.1 KernSmooth_2.23-17 gridExtra_2.3 lsa_0.73.2 IRanges_2.22.1 parallelly_1.26.1 codetools_0.2-17 MASS_7.3-53 assertthat_0.2.1 rprojroot_2.0.2 withr_2.4.2 qlcMatrix_0.9.7 sctransform_0.3.2 Rsamtools_2.4.0 S4Vectors_0.26.0 GenomeInfoDbData_1.2.3 hms_0.5.3 mgcv_1.8-33 parallel_4.0.3 rpart_4.1-15 rmarkdown_2.5 Rtsne_0.15 ggforce_0.3.3 lubridate_1.7.9 shiny_1.6.0